# Preprocessing and training pipeline for the shopping-review text classifier.

import pandas as pd
import numpy as np
import jieba
import re
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
from tensorflow import keras
from nlu_model.util.pkl_impl import save_pkl, load_pkl
from nlu_model.ptm.word2vector import Word2vector
from nlu_model.cls.model.cls_model import ClsModel

# data preprocess
def loadfile(pos_path='./data/cls/shopping_reviews/pos.xls',
             neg_path='./data/cls/shopping_reviews/neg.xls',
             test_size=0.2, random_state=666):
    """Load the review spreadsheets, tokenize them and return a train/test split.

    Both spreadsheets are read without a header row; column 0 of each row is
    treated as one review. Every review has whitespace and a fixed set of
    ASCII/Chinese punctuation removed and is then segmented with jieba.
    Positive reviews are labelled 1.0 and negative reviews 0.0.

    Args:
        pos_path: Path of the spreadsheet holding the positive reviews.
        neg_path: Path of the spreadsheet holding the negative reviews.
        test_size: Forwarded to ``train_test_split`` (share of the test split).
        random_state: Forwarded to ``train_test_split`` (shuffling seed).

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)`` produced by
        ``train_test_split``; the ``x`` arrays hold token lists and the ``y``
        arrays hold the matching float labels.
    """
    # The former ``index=None`` argument was removed: ``read_excel`` defines no
    # such parameter, and newer pandas releases raise TypeError on unknown
    # keywords instead of silently ignoring them.
    neg = pd.read_excel(neg_path, header=None)
    pos = pd.read_excel(pos_path, header=None)

    # Compiled once here rather than on every row inside cw().
    punctuation = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）：]+")

    def cw(x):
        """Strip punctuation/whitespace from one review and segment it."""
        return list(jieba.cut(punctuation.sub("", x)))

    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)

    # Labels follow the concatenation order below: positives first, then negatives.
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x = np.concatenate((pos['words'], neg['words']))

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    return x_train, x_test, y_train, y_test

# Build the tokenized splits, then persist each one as a text file with one
# sample per line in the form "<label>\t<space-joined tokens>".
x_train, x_test, y_train, y_test = loadfile()

# The encoding is pinned to UTF-8 because the tokens are Chinese text; relying
# on the platform default encoding can fail or yield non-portable files.
with open("./data/cls/shopping_reviews/train.txt", "w", encoding="utf-8") as f:
    f.writelines("%s\t%s\n" % (label, " ".join(tokens))
                 for label, tokens in zip(y_train, x_train))

with open("./data/cls/shopping_reviews/test.txt", "w", encoding="utf-8") as f:
    f.writelines("%s\t%s\n" % (label, " ".join(tokens))
                 for label, tokens in zip(y_test, x_test))

# Create the project's word-vector wrapper and restore it from the three
# pickles stored for the shopping-review corpus. Judging by the file names
# these are the word-to-index table, the embedding matrix and the model
# config -- TODO confirm against Word2vector.load.
word2vector = Word2vector()
word2vector.load(
    "./data/ptm/shopping_reviews/w2v_word2idx2020100601.pkl",
    "./data/ptm/shopping_reviews/w2v_model_metric_2020100601.pkl",
    "./data/ptm/shopping_reviews/w2v_model_conf_2020100601.pkl",
)

# Alternative embedding set (weibo_w2v_300), kept disabled for reference:
# word2vector.load("./data/ptm/weibo_w2v_300/w2v_word2idx2020101701.pkl",
#           "./data/ptm/weibo_w2v_300/w2v_model_metric_2020101701.pkl",
#           "./data/ptm/weibo_w2v_300/w2v_model_conf_2020101701.pkl")

# Run both token-list batches through batch2idx (train first, then test) and
# rebind the split variables to the converted results.
x_train, x_test = map(word2vector.batch2idx, (x_train, x_test))


# Sequence padding: bring every sample to a length of 50 entries.
# Shorter sequences get 0 appended at the end (padding='post'); longer ones are
# cut using pad_sequences' default truncating mode, since none is passed here.
# NOTE(review): 50 equals model_conf["MAX_LEN"] further down -- presumably the
# two values must stay in sync; verify against ClsModel.
x_train, x_test = (
    keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=50, padding='post', value=0)
    for sequences in (x_train, x_test)
)

# Settings handed to ClsModel. MAX_LEN repeats the padded sequence length used
# above; w2c_len is presumably the embedding dimension (TODO confirm).
model_conf = dict(MAX_LEN=50, w2c_len=300, emb_model=word2vector)
train_conf = dict(batch_size=64, epochs=1, verbose=1)

# Build the "textcnn_small" classifier and train it. The held-out split is
# passed to fit() as its 3rd/4th arguments (presumably validation data --
# confirm against ClsModel.fit) and is reused for evaluate() right after.
cls_model = ClsModel("textcnn_small", model_conf, train_conf)
cls_model.fit(x_train, y_train, x_test, y_test)
print(cls_model.evaluate(x_test, y_test))

# Persist the trained model so it can be reloaded later in this script.
cls_model.save("./data/cls/shopping_reviews/model_20201007")


# Smoke test: predict one raw sentence with the freshly trained model.
sentence = "这台手机真性能还挺好的"
print(cls_model.predict([sentence]))


# Round trip: rebuild the classifier from the saved path and predict the same
# sentence again with the reloaded model.
# NOTE(review): this new Word2vector is never .load()-ed in this file;
# presumably ClsModel("load", ...) restores the embeddings itself -- confirm.
word2vector = Word2vector()
cls_model = ClsModel(
    "load",
    model_conf=dict(path="./data/cls/shopping_reviews/model_20201007",
                    emb_model=word2vector),
    train_conf={},
)
print(cls_model.predict([sentence]))

